* ---------------------------------------------------------------------------
* Load the appended CIFRE extract, harmonise identifiers, firm location and
* contract dates, and keep the variables needed by the later steps.
* Requires -egenmore- (egen ..., sieve()).
* Forward slashes are used in the path so it resolves on every platform.
* ---------------------------------------------------------------------------
use "${data}/source/Cifre_appended_0318_complete", clear
	drop if mi(datedeffet)

	* Shorter names for the identifiers used further down.
	rename siretdelentreprise 			siret
	rename dénominationdulaboratoire 	lab_name
	rename raisonsocialedelentreprise 	firm_name
	rename nomdelinstitutionderattache 	tutelle_name
	rename codenafdelentreprise 		naf
	rename ndéquipedulaboratoire 		teamnumber_lab

	rename codepostaldelentreprise 		zipcode_firm
	rename codepostaldeladressedulabo 	zipcode_lab

	* Trim and collapse blanks in the postcode / town fields.
	* zipcode_firm is listed explicitly: it was renamed just above, so the
	* codepostal* wildcard no longer reaches it, and a blank-only value would
	* not count as missing in the fallbacks below.
	foreach cp of varlist zipcode_firm codepostal* villede* {
		replace `cp' = trim(stritrim(`cp'))
	}

	* Firm postcode: fall back on the two other postcode fields, in this order.
	replace zipcode_firm = codepostaldelentreprisedem if mi(zipcode_firm) & !mi(codepostaldelentreprisedem)
	replace zipcode_firm = codepostaldeladressepersonn if mi(zipcode_firm) & !mi(codepostaldeladressepersonn)

	* Town name: same fallback order, then normalise to upper-case ASCII
	* letters and single spaces so it can be matched on text.
	gen libcom = villedelentreprise
	replace libcom = villedelentreprisedembauche if mi(libcom)
	replace libcom = villedeladressepersonnelled if mi(libcom)
	replace libcom = subinstr(libcom,"-"," ",.)
	* Decompose accented characters (NFD) and strip the combining marks.
	replace libcom = upper(ustrlower(ustrregexra(ustrnormalize(libcom,"nfd"),"\p{Mark}","")))
	egen libcom2 = sieve(libcom), keep(alphabetic space)
	drop libcom
	ren libcom2 libcom
	replace libcom = subinstr(libcom,"CEDEX","",.)
	* Pad with blanks so that " ST " only matches the stand-alone token.
	replace libcom = " " + libcom + " "
	replace libcom = subinstr(libcom," ST "," SAINT ",.)
	replace libcom = trim(stritrim(libcom))

	* Start date: try day-month-year first, then year-month-day.
	gen date_start = date(datedeffet,"DMY")
	gen date_start2 = date(datedeffet,"YMD")
	replace date_start = date_start2 if mi(date_start)
	drop date_start2
	format date_start %td

	* End date: same two-step parsing.
	gen date_end = date(datedefin,"DMY")
	gen date_end2 = date(datedefin,"YMD")
	replace date_end = date_end2 if mi(date_end)
	drop date_end2
	format date_end %td

	* When the start date falls in 1900, rebuild the year from the end date
	* minus three; then floor the year at 2003.
	replace year = yofd(date_end) - 3  if yofd(date_start)==1900
	replace year = 2003 if year<2003
	gen year_end = yofd(date_end)

	* Drop every variable that is missing in all observations.
	foreach var of varlist _all {
		count if mi(`var')
		if(`r(N)'==_N) drop `var'
	}

	rename autredisciplinescientifique 		fieldbis
	rename libellédelautredisciplinesc 		fieldbis_label

	rename disciplinescientifique			field
	rename libellédeladisciplinescienti  	field_label

	keep ndossier siren siret lab_name firm_name tutelle_name teamnumber_lab naf zipcode* libcom year field* salaire

	keep if !mi(siren)

* ---------------------------------------------------------------------------
* Standardise the firm activity code (naf) to a 5-character code made of
* four digits followed by a non-digit, using the firm-level mode and two
* external look-up files as fallbacks.
* Requires -egenmore- (egen ..., sieve()).
* ---------------------------------------------------------------------------
cd "${tmp}"

order siren siret  firm_name year

* Trim and collapse blanks in every string variable (numeric ones are skipped).
ds, has(type string)
foreach var in `r(varlist)' {
	replace `var' = trim(stritrim(`var'))
}

* Upper-case first: the character sieve, the token removal and the "Z" search
* below are all case-sensitive, so lower-case input would otherwise escape them.
replace naf = upper(naf)

* naf2 = naf restricted to the characters listed in char().
egen naf2 = sieve(naf), char(1 2 3 4 5 6 7 8 9 0 A B C Z)
replace naf2 = trim(stritrim(naf2))
replace naf = naf2 if strlen(naf) > 7 & strlen(naf2) == 5
replace naf = substr(naf2,1,5) if strlen(naf) > 10 & strlen(naf2) <= 7
replace naf = "" if substr(naf,1,3) == "000"

* Strip separators and literal prefixes.
foreach token in "-" "." " " "NAF" "APE" "NAS" "DSB" {
	replace naf = subinstr(naf,"`token'","",.)
}

* In over-long values, keep the five characters ending at the first "Z".
replace naf = substr(naf,strpos(naf,"Z")-4,5) if regexm(naf,"Z") & strlen(naf) > 5
drop naf2

* Within a firm, impose the modal 5-character code (lowest code on ties).
bys siren: egen naf_mode = mode(naf) if strlen(naf) == 5, minmode
* Empty strings sort first, so the last row of the group carries the mode.
bys siren (naf_mode): replace naf_mode = naf_mode[_N]
replace naf = naf_mode if !mi(naf_mode)
drop naf_mode

* 4-character codes are kept aside as naf_rev1 and used as the key of the
* second look-up; the first look-up is keyed on siren.
gen naf_rev1 = naf if strlen(naf) == 4
destring siren, replace force
merge m:1 siren using "${data}/Utils/Siren_NAF_rev12", nogen keep(1 3) keepusing(naf_rev2)
merge m:1 naf_rev1 using "${data}/Utils/NAF_rev12_impute", nogen keep(1 3 4 5) update  keepusing(naf_rev2)
replace naf = naf_rev2 if strlen(naf) < 5
drop naf_rev?

* Final validity check: four digits followed by one non-digit character.
replace naf = substr(naf,1,5)
replace naf = "" if real(substr(naf,1,4)) == . | real(substr(naf,5,1)) != .
**-----
* ---------------------------------------------------------------------------
* Attach a ZE2010 code to every observation from the firm postcode and the
* normalised town name (libcom), in decreasing order of reliability:
*   1. exact match on (dep, libcom)
*   2. exact match on libcom when the name is unique in the mapping
*   3. same, with blanks removed from the mapping names
*   4. fuzzy match (soundex block + Jaro-Winkler >= 0.8) within dep
*   5. fuzzy match (soundex block + Jaro-Winkler >= 0.9) on the blank-free names
*   6. hand-coded overrides
*   7. modal ZE2010 of the dep
* Requires -egenmore-, -gtools- (gduplicates) and -jarowinkler-.
* ---------------------------------------------------------------------------
ren zipcode_firm code_postal
* Keep digits only when the postcode is not already a number.
egen cp2 = sieve(code_postal), keep(numeric)
replace code_postal = cp2 if real(code_postal) == .
drop cp2
gen dep = substr(code_postal,1,2)
destring code_postal, replace
*joinby code_postal libcom using "${intpath}/Codes_postaux", unm(master)

* ---- Build the look-up files from the commune -> ZE mapping ----------------
preserve
use "${data}/Utils/depcom_ZE_mapping", clear
* Same town-name normalisation as applied to the CIFRE file.
replace libcom = subinstr(libcom,"-"," ",.)
replace libcom = upper(ustrlower(ustrregexra(ustrnormalize(libcom,"nfd"),"\p{Mark}","")))
egen libcom2 = sieve(libcom), keep(alphabetic space)
drop libcom
ren libcom2 libcom
replace libcom = subinstr(libcom,"CEDEX","",.)
replace libcom = trim(stritrim(libcom))
gduplicates drop
drop if mi(ZE2010)
* depcom is numeric, so 4-digit codes must be left-padded to five digits
* before taking the first two characters; "%02.0f" turned 1001 into dep "10".
gen dep = substr(string(depcom, "%05.0f"),1,2)
keep dep libcom ZE2010
gduplicates drop
* Keep only names that are unambiguous within a dep.
gduplicates tag dep libcom, gen(dup)
drop if dup != 0
drop dup
tempfile dp
save `dp'
* dp3: dp plus a soundex key, used to block the within-dep fuzzy match.
gen sound = soundex(libcom)
tempfile dp3
save `dp3'
drop sound
* dp2: names that are unique across the whole mapping.
gduplicates tag libcom, gen(dup)
drop if dup != 0
drop dup
tempfile dp2
save `dp2'
* dp4: same with blanks removed, again restricted to unique names.
replace libcom = subinstr(libcom," ","",.)
gduplicates tag libcom, gen(dup)
drop if dup != 0
drop dup
tempfile dp4
save `dp4'
* dp5: dp4 plus a soundex key, used to block the second fuzzy match.
gen sound = soundex(libcom)
tempfile dp5
save `dp5'
restore

* ---- Exact matches ---------------------------------------------------------
* Drop dep codes of 97 and above. The !mi() guard matters: a missing value
* compares greater than any number, so without it every row lacking a usable
* postcode would be dropped as well.
* NOTE(review): confirm that keeping rows without a postcode is intended.
drop if real(dep) >= 97 & !mi(real(dep))
merge m:1 dep libcom using `dp', keep(1 3) nogen
* keepusing(ZE2010): only the zone is filled in, dep is never overwritten.
merge m:1 libcom using `dp2', keep(1 3 4 5) nogen update keepusing(ZE2010)
merge m:1 libcom using `dp4', keep(1 3 4 5) nogen update keepusing(ZE2010)

* ---- Fuzzy match 1: same dep, same soundex, best Jaro-Winkler >= 0.8 --------
preserve
keep if mi(ZE2010)
keep libcom dep
gduplicates drop
gen sound = soundex(libcom)
ren libcom libcom_orig
joinby sound dep using `dp3', unm(master)
jarowinkler libcom_orig libcom, gen(score)
* Missing scores (no candidate) would sort last and pass ">= 0.8".
drop if mi(score)
* Best candidate per (name, dep); libcom breaks ties deterministically.
bys libcom_orig dep (score libcom): keep if _n == _N
keep if score >= 0.8
* dep is kept so the result is only applied inside the dep it was found in.
keep libcom_orig dep ZE2010
ren libcom_orig libcom
gduplicates drop
tempfile jaro1
save `jaro1'
restore

* ---- Fuzzy match 2: any dep, same soundex, best Jaro-Winkler >= 0.9 ---------
preserve
keep if mi(ZE2010)
keep libcom dep
gduplicates drop
gen sound = soundex(libcom)
ren libcom libcom_orig
joinby sound using `dp5', unm(master)
jarowinkler libcom_orig libcom, gen(score)
drop if mi(score)
bys libcom_orig (score libcom): keep if _n == _N
keep if score >= 0.9
keep libcom_orig ZE2010
ren libcom_orig libcom
gduplicates drop
tempfile jaro2
save `jaro2'
restore

merge m:1 libcom dep using `jaro1', nogen keep(1 3 4 5) update
merge m:1 libcom using `jaro2', nogen keep(1 3 4 5) update

* ---- Hand-coded overrides (applied whatever was matched above) -------------
replace ZE2010 = "5201" if regexm(libcom,"ANCENIS")
replace ZE2010 = "9305" if regexm(libcom,"SOPHIA ANTIPOLIS")
* Parentheses only make the original precedence explicit (& binds before |).
replace ZE2010 = "1101" if (regexm(libcom,"PARIS") & regexm(libcom,"DEFENS")) | libcom == "LA DEFENSE" | libcom == "LA PLAINE SAINT DENIS"
replace ZE2010 = "1102" if regexm(libcom,"MARN") & regexm(libcom,"VALLEE")
replace ZE2010 = "1107" if regexm(libcom,"MORET") & regexm(libcom,"LOING")
replace ZE2010 = "1119" if regexm(libcom,"CERGY PONTOISE")

* ---- Last resort: modal zone of the dep ------------------------------------
bys dep: egen mod_ze = mode(ZE2010)
* Rows without a dep are not filled: their "group" is not a real dep.
replace ZE2010 = mod_ze if mi(ZE2010) & !mi(dep)
compress
save "${tmp}/Cifre_0318_NAF_ZE_clean", replace


* ---------------------------------------------------------------------------
* Expand each dossier into one row per year from its start year to start
* year + 3, flag new / active dossiers and firms, and collapse to counts by
* activity code (ape) x ZE2010 x year.
* Requires -gtools- (gcollapse).
* ---------------------------------------------------------------------------
use "${tmp}/Cifre_0318_NAF_ZE_clean", clear
* Numeric dossier identifier: ndossier with the "/" removed.
gen double id = real(subinstr(ndossier,"/","",.))
gen year_end = year + 3

* Index the copies by the pre-expand row, not by id: if a dossier appears
* more than once (or id is missing), numbering copies within id would run
* past year_end. With unique ids the result is unchanged.
tempvar row
gen long `row' = _n
expand year_end - year + 1
ren year year_start
bys `row': gen year = year_start + _n - 1

* One "active" flag per dossier-year, and one "new" flag per dossier in its
* start year (earliest start year first if a dossier is duplicated).
bys id year (year_start): gen active = (_n == 1)
bys id year (year_start): gen new = (_n == 1 & year == year_start)

* Firm-level counterparts, defined only for a non-missing, non-zero siren.
bys siren ZE2010 year_start (year): gen firmnew = (_n == 1 & year==year_start) if !mi(siren ) & siren != 0
bys siren ZE2010 year: gen firmactive = (_n == 1) if !mi(siren ) & siren != 0

gcollapse (sum) nb_newcifre = new nb_activecifre = active nb_siren_newcifre = firmnew nb_siren_activecifre = firmactive salaire_cifre = salaire, by(naf ZE2010 year)
ren naf ape
drop if mi(ape) | mi(ZE2010)
keep if inrange(year,2004,2018)
save "${tmp}/Cifre_0318_NAF_ZE_collapse", replace